library(tidyverse)
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
-- Attaching packages ---------------------------------- tidyverse 1.2.1.9000 --
v ggplot2 3.2.1     v purrr   0.3.2
v tibble  2.1.3     v dplyr   0.8.3
v tidyr   0.8.3     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
package ‘ggplot2’ was built under R version 3.6.1
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(tidytext)
package ‘tidytext’ was built under R version 3.6.1
library(jsonlite)

Attaching package: ‘jsonlite’

The following object is masked from ‘package:purrr’:

    flatten
# Load the CORD-19 metadata table. Column types are pinned for the two columns
# that readr's type guessing gets wrong: `pmcid` holds identifiers such as
# "PMC1054884" (guessed as logical) and `publish_time` holds values with
# trailing text such as "Mar 1" after the numeric part (guessed as double).
# Guessing produced 44397 parsing failures, and failed values are read as NA.
# All other columns are still guessed.
metadata <- read_csv(
  "./2020-03-13/all_sources_metadata_2020-03-13.csv",
  col_types = cols(
    pmcid = col_character(),
    publish_time = col_character()
  )
)
Parsed with column specification:
cols(
  sha = col_character(),
  source_x = col_character(),
  title = col_character(),
  doi = col_character(),
  pmcid = col_logical(),
  pubmed_id = col_double(),
  license = col_character(),
  abstract = col_character(),
  publish_time = col_double(),
  authors = col_character(),
  journal = col_character(),
  `Microsoft Academic Paper ID` = col_double(),
  `WHO #Covidence` = col_character(),
  has_full_text = col_logical()
)
44397 parsing failures.
 row          col               expected     actual                                               file
1237 pmcid        1/0/T/F/TRUE/FALSE     PMC1054884 './2020-03-13/all_sources_metadata_2020-03-13.csv'
1237 publish_time no trailing characters  Mar 1     './2020-03-13/all_sources_metadata_2020-03-13.csv'
1238 pmcid        1/0/T/F/TRUE/FALSE     PMC1065064 './2020-03-13/all_sources_metadata_2020-03-13.csv'
1238 publish_time no trailing characters  Oct 14    './2020-03-13/all_sources_metadata_2020-03-13.csv'
1239 pmcid        1/0/T/F/TRUE/FALSE     PMC1084330 './2020-03-13/all_sources_metadata_2020-03-13.csv'
.... ............ ...................... .......... ..................................................
See problems(...) for more details.
# Open the spreadsheet viewer only in an interactive session; View() is an
# interactive tool and is skipped when the notebook is rendered in batch mode.
if (interactive()) {
  View(metadata)
}

# Row counts per license value, most frequent first.
metadata %>%
  count(license, sort = TRUE)

# Row counts per publish_time value, most frequent first.
metadata %>%
  count(publish_time, sort = TRUE)

# Rows flagged as having full text (filter() drops rows where the flag is NA).
metadata %>%
  filter(has_full_text)

# Rows that have a non-missing abstract.
metadata %>%
  filter(!is.na(abstract))
NA

Extracting text from all the full papers

# Parse every file in the comm_use_subset directory into a nested list, one
# element per file. Only *.json files are listed so that a stray non-JSON
# file in the directory cannot make read_json() abort the whole import.
json_objects <- list.files(
  "./2020-03-13/comm_use_subset/comm_use_subset/",
  pattern = "\\.json$",
  full.names = TRUE
) %>%
  map(read_json)
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
LS0tDQp0aXRsZTogIkNPVklELTE5IE9wZW4gUmVzZWFyY2ggRGF0YXNldCBDaGFsbGVuZ2UgKENPUkQtMTkpIg0Kc3VidGl0bGU6ICJFeHBsb3JhdG9yeSBEYXRhIEFuYWx5c2lzIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KYGBge3J9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkodGlkeXRleHQpDQpsaWJyYXJ5KGpzb25saXRlKQ0KDQoNCm1ldGFkYXRhIDwtIHJlYWRfY3N2KCIuLzIwMjAtMDMtMTMvYWxsX3NvdXJjZXNfbWV0YWRhdGFfMjAyMC0wMy0xMy5jc3YiKQ0KDQpWaWV3KG1ldGFkYXRhKQ0KDQptZXRhZGF0YSAlPiUgDQogIGNvdW50KGxpY2Vuc2UsIHNvcnQgPSBUKQ0KDQptZXRhZGF0YSAlPiUgDQogIGNvdW50KHB1Ymxpc2hfdGltZSwgc29ydCA9IFQpDQoNCm1ldGFkYXRhICU+JSANCiAgZmlsdGVyKGhhc19mdWxsX3RleHQpDQoNCm1ldGFkYXRhICU+JSANCiAgZmlsdGVyKCFpcy5uYShhYnN0cmFjdCkpDQoNCmBgYA0KDQojIEV4dHJhY3RpbmcgdGV4dCBmb20gYWxsIHRlaCBmdWxsIHBhcGVycw0KDQpgYGB7cn0NCg0KanNvbl9vYmplY3RzIDwtIGRpcigiLi8yMDIwLTAzLTEzL2NvbW1fdXNlX3N1YnNldC9jb21tX3VzZV9zdWJzZXQvIiwgZnVsbC5uYW1lcz1UKSAlPiUgDQogIG1hcChyZWFkX2pzb24pDQoNCg0KDQoNCmBgYA0KDQoNCg0KDQoNCg0KDQoNCg0KDQo=